package com.yj.auto.plugin.lucene.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;

import com.jfinal.log.Log;
import com.xiaoleilu.hutool.io.FileUtil;
import com.yj.auto.Constants;

/**
 * Static helpers that extract the plain-text content of common document
 * formats (TXT, Word, Excel, PowerPoint, PDF, HTML) given a file path.
 *
 * <p>
 * Every method logs a message containing the file path (and the causing
 * exception) before rethrowing any failure to the caller.
 */
public class DocumentUtil {
	private static final Log logger = Log.getLog(DocumentUtil.class);

	/**
	 * Reads a plain-text file into a string.
	 * 
	 * @param filePath
	 *            path of the file to read
	 * @param charset
	 *            name of the character encoding used to decode the file
	 * @return the file content
	 * @throws Exception
	 *             if the file cannot be read; the failure is logged first
	 */
	public static String txt2String(String filePath, String charset) throws Exception {
		String text = null;
		try {
			text = FileUtil.readString(new File(filePath), charset);
		} catch (Exception e) {
			logger.error("text to string error:" + filePath, e);
			throw e;
		}
		return text;
	}

	/**
	 * Reads a plain-text file into a string using
	 * {@link Constants#SYS_ENCODING}.
	 * 
	 * @param filePath
	 *            path of the file to read
	 * @return the file content
	 * @throws Exception
	 *             if the file cannot be read
	 */
	public static String txt2String(String filePath) throws Exception {
		return txt2String(filePath, Constants.SYS_ENCODING);
	}

	/**
	 * Extracts the text of a Word document. The file is first opened as an
	 * OOXML (.docx) document; if POI reports that it is actually an OLE2 file,
	 * it is re-read as a legacy binary (.doc) document.
	 * 
	 * @param filePath
	 *            path of the Word file
	 * @return the extracted text
	 * @throws Exception
	 *             if the file cannot be opened or parsed; the failure is
	 *             logged first
	 */
	public static String word2String(String filePath) throws Exception {
		String text = null;
		InputStream is = null;
		POITextExtractor extractor = null;
		try {
			is = new FileInputStream(filePath);
			try { // office 2007 +
				XWPFDocument doc = new XWPFDocument(is);
				extractor = new XWPFWordExtractor(doc);
			} catch (OLE2NotOfficeXmlFileException e) {// office 2003 -
				// The OOXML attempt has already read from the stream and a
				// FileInputStream cannot be rewound, so reopen the file to
				// hand the OLE2 parser the content from its first byte.
				closeQuietly(is);
				is = new FileInputStream(filePath);
				extractor = new WordExtractor(is);
			}
			text = extractor.getText();
		} catch (Exception e) {
			logger.error("word to string error:" + filePath, e);
			throw e;
		} finally {
			closeQuietly(extractor);
			closeQuietly(is);
		}
		return text;
	}

	/**
	 * Extracts the text of an Excel workbook. The workbook type is detected by
	 * {@link WorkbookFactory}; an {@link XSSFWorkbook} is handled by the XSSF
	 * extractor, anything else is treated as an {@link HSSFWorkbook}.
	 * 
	 * @param filePath
	 *            path of the Excel file
	 * @return the extracted text
	 * @throws Exception
	 *             if the file cannot be opened or parsed; the failure is
	 *             logged first
	 */
	public static String excel2String(String filePath) throws Exception {
		String text = null;
		InputStream is = null;
		Workbook book = null;
		POITextExtractor extractor = null;
		try {
			is = new FileInputStream(filePath);
			book = WorkbookFactory.create(is);
			if (book instanceof XSSFWorkbook) {// office 2007+
				extractor = new XSSFExcelExtractor((XSSFWorkbook) book);
			} else {
				extractor = new org.apache.poi.hssf.extractor.ExcelExtractor((HSSFWorkbook) book);
			}
			text = extractor.getText();
		} catch (Exception e) {
			logger.error("excel to string error:" + filePath, e);
			throw e;
		} finally {
			if (extractor != null) {
				closeQuietly(extractor);
			} else {
				// No extractor was created, so nothing else will release the
				// workbook that may already have been opened.
				closeQuietly(book);
			}
			closeQuietly(is);
		}
		return text;
	}

	/**
	 * Extracts the text of a PowerPoint presentation. Only the XSLF (OOXML)
	 * classes are used here, so the file is opened as a .pptx package.
	 * 
	 * @param filePath
	 *            path of the presentation file
	 * @return the extracted text
	 * @throws Exception
	 *             if the file cannot be opened or parsed; the failure is
	 *             logged first
	 */
	public static String ppt2String(String filePath) throws Exception {
		String text = null;
		XSLFSlideShow doc = null;
		XSLFPowerPointExtractor extractor = null;
		try {
			doc = new XSLFSlideShow(filePath);
			extractor = new XSLFPowerPointExtractor(doc);
			text = extractor.getText();
		} catch (Exception e) {
			logger.error("ppt to string error:" + filePath, e);
			throw e;
		} finally {
			if (extractor != null) {
				closeQuietly(extractor);
			} else {
				// No extractor was created, so release the opened document
				// directly.
				closeQuietly(doc);
			}
		}
		return text;
	}

	/**
	 * Extracts the text of a PDF document.
	 * 
	 * @param filePath
	 *            path of the PDF file
	 * @return the extracted text, or {@code null} if the file does not exist
	 * @throws Exception
	 *             if the file cannot be loaded or parsed; the failure is
	 *             logged first
	 */
	public static String pdf2String(String filePath) throws Exception {
		String text = null;
		PDDocument doc = null;
		try {
			File file = new File(filePath);
			if (!file.exists())
				return null;
			doc = PDDocument.load(file);
			PDFTextStripper stripper = new PDFTextStripper();
			text = stripper.getText(doc);
		} catch (Exception e) {
			logger.error("pdf to string error:" + filePath, e);
			throw e;
		} finally {
			closeQuietly(doc);
		}
		return text;
	}

	/**
	 * Extracts the visible text of an HTML file.
	 * 
	 * @param filePath
	 *            path of the HTML file
	 * @param charset
	 *            name of the character encoding passed to the HTML parser
	 * @return the document text, or {@code null} if the file does not exist
	 * @throws Exception
	 *             if the file cannot be read or parsed; the failure is logged
	 *             first
	 */
	public static String html2String(String filePath, String charset) throws Exception {
		String text = null;
		try {
			File file = new File(filePath);
			if (!file.exists())
				return null;
			org.jsoup.nodes.Document doc = Jsoup.parse(file, charset);
			text = doc.text();
		} catch (Exception e) {
			logger.error("html to string error:" + filePath, e);
			throw e;
		}
		return text;
	}

	/**
	 * Extracts the visible text of an HTML file using
	 * {@link Constants#SYS_ENCODING}.
	 * 
	 * @param filePath
	 *            path of the HTML file
	 * @return the document text, or {@code null} if the file does not exist
	 * @throws Exception
	 *             if the file cannot be read or parsed
	 */
	public static String html2String(String filePath) throws Exception {
		return html2String(filePath, Constants.SYS_ENCODING);
	}

	/**
	 * Closes a resource on a best-effort basis.
	 * 
	 * @param resource
	 *            the resource to close; {@code null} is ignored
	 */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Deliberately ignored: a failure while closing must not replace
			// the extracted text or the original exception.
		}
	}
}
